This notebook calculates correlations between cell entry selections: either the median of all LibA selections vs. the median of all LibB selections, or all individual selections for a specific condition.¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# All defaults are None; papermill overrides them via an injected "Parameters" cell.
altair_config = None  # path to an Altair theme .py file (exec'd to set plot styling)
nipah_config = None  # path to the main YAML config
codon_variants_file = None  # CSV of codon variants (per-barcode table)
CHO_EFNB2_corr_plot_save = None  # output HTML: LibA-vs-LibB median correlation, CHO-EFNB2
CHO_EFNB3_corr_plot_save = None  # output HTML: LibA-vs-LibB median correlation, CHO-EFNB3
CHO_EFNB2_indiv_plot_save = None  # output HTML: pairwise per-selection correlations, CHO-EFNB2
CHO_EFNB3_indiv_plot_save = None  # output HTML: pairwise per-selection correlations, CHO-EFNB3
histogram_plot = None  # output HTML: histogram of amino-acid substitution counts
In [2]:
# Parameters
# (papermill-injected values that override the tagged parameter-cell defaults above)
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
codon_variants_file = "results/variants/codon_variants.csv"
CHO_EFNB2_corr_plot_save = "results/images/CHO_EFNB2_library_corr.html"
CHO_EFNB3_corr_plot_save = "results/images/CHO_EFNB3_library_corr.html"
CHO_EFNB2_indiv_plot_save = "results/images/CHO_EFNB2_all_corrs.html"
CHO_EFNB3_indiv_plot_save = "results/images/CHO_EFNB3_all_corrs.html"
histogram_plot = "results/images/variants_histogram.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import Bio.SeqIO
import yaml
from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# Ensure the notebook runs from the repository root so relative paths resolve.
# NOTE(review): hardcoded absolute path; consider deriving it from config instead.
repo_root = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
# os.getcwd() never returns a trailing slash, so the original string equality
# against the trailing-slash path could never match and the notebook always
# re-chdir'd; compare normalized paths instead.
if os.path.normpath(os.getcwd()) == os.path.normpath(repo_root):
    print("Already in correct directory")
else:
    os.chdir(repo_root)
    print("Setup in correct directory")
Setup in correct directory
In [5]:
#altair_config = 'data/custom_analyses_data/theme.py'
#nipah_config = 'nipah_config.yaml'
#codon_variants_file = 'results/variants/codon_variants.csv'
#CHO_EFNB2_corr_plot_save
#CHO_EFNB3_corr_plot_save
#CHO_EFNB2_indiv_plot_save
#CHO_EFNB3_indiv_plot_save
In [6]:
# Apply the shared Altair theme (a Python file executed in this namespace),
# then load the main notebook config and the functional-effects config.
if altair_config:
    with open(altair_config) as theme_file:
        # NOTE(review): exec of file contents — acceptable only because the
        # theme file ships with this repository, not untrusted input.
        exec(theme_file.read())

with open(nipah_config) as config_file:
    config = yaml.safe_load(config_file)

with open('data/func_effects_config.yml') as func_config_file:
    config_func = yaml.safe_load(func_config_file)
In [7]:
def _library_selection_files(selections, library):
    """Return '<selection>_func_effects.csv' file names for the selections that
    belong exclusively to `library` ('LibA' or 'LibB')."""
    other = 'LibB' if library == 'LibA' else 'LibA'
    return [s + '_func_effects.csv' for s in selections if library in s and other not in s]

# Replaces four copy-pasted comprehensions with calls to one helper.
cho_efnb2_low_selections = config_func['avg_func_effects']['CHO_EFNB2_low']['selections']
LibA_CHO_EFNB2 = _library_selection_files(cho_efnb2_low_selections, 'LibA')
LibB_CHO_EFNB2 = _library_selection_files(cho_efnb2_low_selections, 'LibB')

cho_efnb3_low_selections = config_func['avg_func_effects']['CHO_EFNB3_low']['selections']
LibA_CHO_EFNB3 = _library_selection_files(cho_efnb3_low_selections, 'LibA')
LibB_CHO_EFNB3 = _library_selection_files(cho_efnb3_low_selections, 'LibB')
Calculate correlations for LibA and LibB for CHO-EFNB2 cell entry selections¶
In [8]:
path = 'results/func_effects/by_selection/'

def process_func_selections(library, library_name):
    """Read per-selection functional-effect CSVs and compute per-mutation medians.

    Parameters
    ----------
    library : list of str
        CSV file names (within the notebook-level `path` directory), one per
        selection; their effect/times_seen columns get suffixes _1.._n.
    library_name : str
        Label ('LibA' or 'LibB') used in the output median column names.

    Returns
    -------
    pd.DataFrame
        Outer-joined on ['site', 'mutant', 'wildtype'] with columns
        'median_effect_<library_name>' and 'median_times_seen_<library_name>';
        the per-selection numbered columns are dropped.
    """
    if not library:
        raise ValueError('library must contain at least one selections file')
    df_list = []
    for index, file_name in enumerate(library, start=1):
        file_path = os.path.join(path, file_name)
        func_scores = pd.read_csv(file_path)
        # suffix the per-selection columns so they survive the wide merge
        func_scores = func_scores.rename(
            columns={'functional_effect': f'functional_effect_{index}',
                     'times_seen': f'times_seen_{index}'}
        ).drop(columns=['latent_phenotype_effect'])
        df_list.append(func_scores)
    # merge all selections wide on site/mutant/wildtype
    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(merged_df, df, on=['site', 'mutant', 'wildtype'], how='outer')
    # median across selections, for both effect and times_seen
    func_cols = [col for col in merged_df.columns if 'functional_effect' in col]
    merged_df[f'median_effect_{library_name}'] = merged_df[func_cols].median(axis=1)
    times_seen_cols = [col for col in merged_df.columns if 'times_seen' in col]
    merged_df[f'median_times_seen_{library_name}'] = merged_df[times_seen_cols].median(axis=1)
    # drop the numbered per-selection columns, keeping only keys and medians
    numbered_cols = [col for col in merged_df.columns if re.search(r'_\d+', col)]
    return merged_df.drop(columns=numbered_cols)
# Per-library median functional effects for each receptor's selections
A_selections_E2 = process_func_selections(LibA_CHO_EFNB2,'LibA')
B_selections_E2 = process_func_selections(LibB_CHO_EFNB2,'LibB')
A_selections_E3 = process_func_selections(LibA_CHO_EFNB3,'LibA')
B_selections_E3 = process_func_selections(LibB_CHO_EFNB3,'LibB')
def merge_selections(A_selections, B_selections):
    """Inner-join the LibA and LibB median tables on wildtype/site/mutant,
    keeping only mutations measured in both libraries."""
    join_keys = ['wildtype', 'site', 'mutant']
    return pd.merge(A_selections, B_selections, on=join_keys, how='inner')
# Combine LibA and LibB medians (inner join keeps mutations present in both)
CHO_EFNB2_merged = merge_selections(A_selections_E2,B_selections_E2)
CHO_EFNB3_merged = merge_selections(A_selections_E3,B_selections_E3)
def make_chart_median(df, title):
    """Scatter plot of LibA vs LibB median functional effects, titled with Pearson r.

    Filters to mutations whose median times_seen meets
    config['func_times_seen_cutoff'] in BOTH libraries and whose median effect
    is non-missing in both, then prints the r value.

    NOTE(review): `title` is currently unused (the chart title shows only r);
    kept for interface compatibility with existing callers.
    """
    variant_selector = alt.selection_point(
        on="mouseover",
        empty=False,
        fields=["site", "mutant"],
        value=1,
    )
    # Original looped over the single-element list [cutoff]; flattened here.
    cutoff = config['func_times_seen_cutoff']
    df = df[
        (df['median_times_seen_LibA'] >= cutoff)
        & (df['median_times_seen_LibB'] >= cutoff)
        & (df['median_effect_LibA'].notna())
        & (df['median_effect_LibB'].notna())
    ]
    r_value = float(scipy.stats.linregress(
        df['median_effect_LibA'], df['median_effect_LibB']
    ).rvalue)
    print(f'{r_value:.2f}')
    chart = alt.Chart(df, title=f'r={r_value:.2f}').mark_circle(
        size=20, color='black', opacity=0.15
    ).encode(
        x=alt.X('median_effect_LibA', title='LibA Cell Entry'),
        y=alt.Y('median_effect_LibB', title='LibB Cell Entry'),
        tooltip=['site', 'wildtype', 'mutant', 'median_times_seen_LibA', 'median_times_seen_LibB'],
        # highlight the hovered variant
        size=alt.condition(variant_selector, alt.value(100), alt.value(20)),
        color=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),
        opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.15)),
        strokeWidth=alt.condition(variant_selector, alt.value(5), alt.value(0)),
    ).properties(
        height=400,
        width=400,
    ).add_params(variant_selector)
    # hconcat of one chart preserves the original output structure exactly
    return alt.hconcat(chart).resolve_scale(y='shared', x='shared', color='independent')
# Build, display, and save the LibA-vs-LibB correlation plots (prints r values)
CHO_EFNB2_corr_plot = make_chart_median(CHO_EFNB2_merged,'CHO-EFNB2')
CHO_EFNB2_corr_plot.display()
CHO_EFNB2_corr_plot.save(CHO_EFNB2_corr_plot_save)
CHO_EFNB3_corr_plot = make_chart_median(CHO_EFNB3_merged,'CHO-EFNB3')
CHO_EFNB3_corr_plot.display()
CHO_EFNB3_corr_plot.save(CHO_EFNB3_corr_plot_save)
0.93
0.92
Make correlations between individual selections¶
In [9]:
def process_individ_selections(library):
    """Read each selection's functional-effects CSV and merge them wide.

    Parameters
    ----------
    library : list of str
        CSV file names (within the notebook-level `path` directory), one per
        selection; effect/times_seen columns get suffixes _1.._n in this order.

    Returns
    -------
    (pd.DataFrame, list of int)
        The wide frame (outer-joined on site/mutant/wildtype) and the list
        [1..n] of selection indices used by the downstream pairwise plots.
    """
    if not library:
        raise ValueError('library must contain at least one selections file')
    df_list = []
    for index, file_name in enumerate(library, start=1):
        file_path = os.path.join(path, file_name)
        print(f"Processing file: {file_path}")
        func_scores = pd.read_csv(file_path)
        # suffix the per-selection columns so they survive the wide merge
        func_scores = func_scores.rename(
            columns={'functional_effect': f'functional_effect_{index}',
                     'times_seen': f'times_seen_{index}'}
        ).drop(columns=['latent_phenotype_effect'])
        df_list.append(func_scores)
    # merge all selections wide on site/mutant/wildtype
    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(merged_df, df, on=['site', 'mutant', 'wildtype'], how='outer')
    # selection indices, used to address the suffixed columns in the plots
    number_list = list(range(1, len(library) + 1))
    return merged_df, number_list
# Merge all individual selections (both libraries) per receptor
CHO_EFNB2_indiv,lib_size_EFNB2 = process_individ_selections(LibA_CHO_EFNB2+LibB_CHO_EFNB2)
CHO_EFNB3_indiv,lib_size_EFNB3 = process_individ_selections(LibA_CHO_EFNB3+LibB_CHO_EFNB3)
def make_chart(df, number_list):
    """Grid of pairwise scatter plots comparing functional effects of
    individual selections i vs j.

    Each panel keeps mutations that pass the times_seen cutoff and have
    non-missing effects in BOTH selections of that pair.

    Fix: the original reassigned `df = df[...]` inside the nested loop, so each
    panel inherited every previous panel's filters (cumulative filtering that
    depended on iteration order); each panel now filters the full frame.
    """
    cutoff = config['func_times_seen_cutoff']
    rows = []
    for i in number_list:
        row_charts = []
        for j in number_list:
            # filter the ORIGINAL frame for this (i, j) pair only
            pair_df = df[
                (df[f'times_seen_{i}'] >= cutoff)
                & (df[f'times_seen_{j}'] >= cutoff)
                & (df[f'functional_effect_{i}'].notna())
                & (df[f'functional_effect_{j}'].notna())
            ]
            chart = alt.Chart(pair_df).mark_circle(size=10, color='black', opacity=0.2).encode(
                x=alt.X(f'functional_effect_{i}'),
                y=alt.Y(f'functional_effect_{j}'),
                tooltip=['site', 'wildtype', 'mutant'],
            ).properties(
                height=alt.Step(10),
                width=alt.Step(10),
            )
            row_charts.append(chart)
        rows.append(alt.hconcat(*row_charts).resolve_scale(y='shared', x='shared', color='independent'))
    return alt.vconcat(*rows).resolve_scale(y='shared', x='shared', color='independent')
# Build and save the pairwise-selection correlation grids
CHO_EFNB2_indiv_plot = make_chart(CHO_EFNB2_indiv,lib_size_EFNB2)
#CHO_EFNB2_indiv_plot.display()
CHO_EFNB2_indiv_plot.save(CHO_EFNB2_indiv_plot_save)
CHO_EFNB3_indiv_plot = make_chart(CHO_EFNB3_indiv,lib_size_EFNB3)
CHO_EFNB3_indiv_plot.save(CHO_EFNB3_indiv_plot_save)
Processing file: results/func_effects/by_selection/LibA-231112-CHO-EFNB2-BA6-nac_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-1_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-2_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-3_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-pool_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231222-CHO-EFNB2-BA6-nac_diffVSV_func_effects.csv Processing file: results/func_effects/by_selection/LibB-231112-CHO-EFNB2-BA6-nac_diff_VSV_func_effects.csv Processing file: results/func_effects/by_selection/LibB-231116-CHO-BA6_PREV_POOL_func_effects.csv Processing file: results/func_effects/by_selection/LibA-230725-CHO-EFNB3-C6-nac-diffVSV_func_effects.csv Processing file: results/func_effects/by_selection/LibA-230916-CHO-EFNB2-BA6-nac_diffVSV_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231024-CHO-EFNB3-C6-nac_func_effects.csv Processing file: results/func_effects/by_selection/LibB-230720-CHO-C6-nac-VSV_func_effects.csv Processing file: results/func_effects/by_selection/LibB-230815-CHO-C6-nac_func_effects.csv Processing file: results/func_effects/by_selection/LibB-230818-CHO-C6-nac_func_effects.csv Processing file: results/func_effects/by_selection/LibB-231116-CHO-C6_PREV_POOL_func_effects.csv
Now make histogram of variants¶
In [10]:
# Load the codon-variant table and show unique barcode counts per library
codon_variants = pd.read_csv(codon_variants_file)
unique_barcodes_per_library = codon_variants.groupby('library')['barcode'].nunique()
display(unique_barcodes_per_library)
library LibA 72252 LibB 52162 Name: barcode, dtype: int64
In [11]:
def calculate_fraction(library, variants=None, max_substitutions=5):
    """Print the fraction of variants in `library` with each number of
    amino-acid substitutions relative to wildtype.

    Parameters
    ----------
    library : str
        Library name to filter on (e.g. 'LibA' or 'LibB').
    variants : pd.DataFrame, optional
        Table with 'library' and 'n_aa_substitutions' columns; defaults to the
        notebook-level `codon_variants` (backward-compatible with old calls).
    max_substitutions : int, optional
        Report fractions for 0 .. max_substitutions - 1 substitutions.
    """
    if variants is None:
        variants = codon_variants  # notebook-level global
    library_variants = variants[variants['library'] == library]
    total = library_variants.shape[0]
    for number in range(max_substitutions):
        # count of variants with exactly `number` amino-acid substitutions
        fraction = (library_variants['n_aa_substitutions'] == number).sum()
        print(f'For {library}, the fraction of sites with {number} mutations relative to WT are: {fraction/total:.2f}')
# Print mutation-count fractions for each library
calculate_fraction('LibA')
calculate_fraction('LibB')
For LibA, the fraction of sites with 0 mutations relative to WT are: 0.11 For LibA, the fraction of sites with 1 mutations relative to WT are: 0.64 For LibA, the fraction of sites with 2 mutations relative to WT are: 0.21 For LibA, the fraction of sites with 3 mutations relative to WT are: 0.03 For LibA, the fraction of sites with 4 mutations relative to WT are: 0.00 For LibB, the fraction of sites with 0 mutations relative to WT are: 0.11 For LibB, the fraction of sites with 1 mutations relative to WT are: 0.65 For LibB, the fraction of sites with 2 mutations relative to WT are: 0.20 For LibB, the fraction of sites with 3 mutations relative to WT are: 0.03 For LibB, the fraction of sites with 4 mutations relative to WT are: 0.00
In [12]:
def plot_histogram(df):
    """Bar chart of variant counts by number of amino-acid substitutions,
    faceted into one column per library."""
    # drop columns the chart does not use, shrinking the embedded data
    unused_cols = ['barcode', 'target', 'variant_call_support',
                   'codon_substitutions', 'aa_substitutions', 'n_codon_substitutions']
    plot_df = df.drop(columns=unused_cols)
    x_axis = alt.X("n_aa_substitutions:N", title='# of AA Substitutions')
    y_axis = alt.Y('count()', title='Count')  # built-in row-count aggregation
    chart = (
        alt.Chart(plot_df)
        .mark_bar(color='black')
        .encode(x_axis, y_axis, column=alt.Column('library', title=None))
        .properties(width=300, height=300)
    )
    return chart
# Plot and save the histogram of amino-acid substitution counts per library
histogram = plot_histogram(codon_variants)
histogram.display()
histogram.save(histogram_plot)